#!/bin/bash
#
# Gene annotation driver.
#
# Setup stage: exports the paths used by the rest of the script, builds the
# 4_annotation/intersect_genes helper, creates a fresh timestamped run
# directory, copies ./genes.faa into it and feeds that copy to
# 2_raw_matrix/run_blast.sh, storing the output as blast_results.txt.
#
# All helper paths (2_raw_matrix/, 3_normed_matrix/, 4_annotation/) and
# genes.faa are relative, so the script must be started from the directory
# that contains them.

# Print the given message on stderr and abort the script.
die() {
    printf '%s\n' "$*" >&2
    exit 1
}

export BASE_DATA_PATH="/mnt/semiterra/mixagol/gene_annotation"
export BLAST_DB_PATH="${BASE_DATA_PATH}/1_databases/blast_db/GENOMES_DB"
export SORT_BIN=/home/mixagol/coreutils-8.13/src/sort
[[ -x "$SORT_BIN" ]] || die "sort binary not found or not executable: $SORT_BIN"

# Byte-wise collation so that every sort/join in the script agrees on ordering.
export LC_ALL=C

# GENOMES_NUM is the line count of the genome map file.
# Assignment is kept separate from `export` so a failure is not masked.
genomes_map="${BASE_DATA_PATH}/3_normed_matrix/map_int_genom.txt"
[[ -r "$genomes_map" ]] || die "cannot read genome map: $genomes_map"
GENOMES_NUM=$(wc -l < "$genomes_map") || die "cannot count lines in $genomes_map"
export GENOMES_NUM

# Rebuild the helper on every run; abort rather than continue with a stale
# or missing binary.
g++ -O3 4_annotation/intersect_genes.cpp -o 4_annotation/intersect_genes \
    || die "failed to compile 4_annotation/intersect_genes.cpp"

# One output directory per run, named after the current epoch second.
ANNOTATE_DIR="${BASE_DATA_PATH}/4_annotation/annot_$(date +%s)"
export ANNOTATE_DIR
mkdir -p -- "$ANNOTATE_DIR" || die "cannot create $ANNOTATE_DIR"

# Keep a copy of the input next to the results.
cat genes.faa > "$ANNOTATE_DIR/genes.faa" \
    || die "cannot copy genes.faa into $ANNOTATE_DIR"

# NOTE(review): assumes run_blast.sh exits 0 on success - confirm.
2_raw_matrix/run_blast.sh "$BLAST_DB_PATH" \
    < "$ANNOTATE_DIR/genes.faa" \
    > "$ANNOTATE_DIR/blast_results.txt" \
    || die "2_raw_matrix/run_blast.sh failed"

# Per-threshold processing.
#
# For every threshold E_VALUE the loop:
#   1. keeps the rows of blast_results.txt whose column 11 is <= E_VALUE
#      (presumably the e-value column of tabular BLAST output - confirm against
#      run_blast.sh) and pipes columns 1-2 through apply_maps.py and
#      create_matrix.py into e_<E_VALUE>/vectors.txt;
#   2. runs intersect_genes on those vectors and the matching
#      matrix_<E_VALUE>_.txt, producing joined_raw.txt;
#   3. joins joined_raw.txt (column 2) with map_int_gene.txt (column 1) and
#      stores the reordered columns, sorted, as joined.txt;
#   4. for every id in column 1 of vectors.txt, joins that id's rows of
#      joined.txt with columns 1 and 4 of genes_info.txt and writes the result
#      to result_<id>_full.txt, plus its first 1000 lines to
#      result_<id>_top1000.txt.

# join(1) needs both inputs sorted on the join field, and this lookup table is
# identical for every gene and every threshold: extract and sort it once.
genes_info_sorted=$(mktemp "${ANNOTATE_DIR}/genes_info_sorted.XXXXXX") || exit 1
trap 'rm -f -- "$genes_info_sorted"' EXIT
cut -f1,4 "${BASE_DATA_PATH}/1_databases/genes_info.txt" \
    | "$SORT_BIN" -t$'\t' -k1,1 -S100M \
    > "$genes_info_sorted" || exit 1

for E_VALUE in 0.05 0.01 0.001 0.0001 0.00001 0.000001; do

    export E_DIR="$ANNOTATE_DIR/e_$E_VALUE"
    mkdir -p -- "$E_DIR" || exit 1

    # "+ 0" forces the threshold to a number, matching a numeric literal.
    awk -F'\t' -v max_e="$E_VALUE" '$11 <= max_e + 0 { print $1 "\t" $2 }' \
            "$ANNOTATE_DIR/blast_results.txt" \
        | 3_normed_matrix/apply_maps.py \
            --genomes-map "${BASE_DATA_PATH}/3_normed_matrix/map_int_genom.txt" \
        | 3_normed_matrix/create_matrix.py \
        > "$E_DIR/vectors.txt"

    4_annotation/intersect_genes \
        -m "${BASE_DATA_PATH}/3_normed_matrix/matrix_${E_VALUE}_.txt" \
        -v "$E_DIR/vectors.txt" \
        -g "$GENOMES_NUM" \
        > "$E_DIR/joined_raw.txt"

    # join output is: key, rest of file 1, rest of file 2; the awk step
    # reorders those columns before the result is sorted as whole lines.
    join -t$'\t' -1 1 -2 2 \
        <("$SORT_BIN" -t$'\t' -k1,1 -S100M "${BASE_DATA_PATH}/3_normed_matrix/map_int_gene.txt") \
        <("$SORT_BIN" -t$'\t' -k2,2 -S100M "${E_DIR}/joined_raw.txt") \
        | awk -F'\t' '{print $3"\t"$2"\t"$4}' \
        | "$SORT_BIN" \
        > "${E_DIR}/joined.txt"

    # Ids are read line by line on fd 3, so they are neither word-split nor
    # glob-expanded and nothing in the loop body can consume the id list.
    while IFS= read -r GENE_ID <&3; do
        [[ -n "$GENE_ID" ]] || continue

        # The id reaches awk through the environment rather than being spliced
        # into the program text; appending "" forces a string comparison.
        # The three sorts: order by column 1 (general numeric), keep the first
        # line for each distinct column 2 (i.e. the one with the lowest
        # column 1), then restore the column 1 ordering.
        join -t$'\t' -1 2 -2 1 \
            <(GENE_ID="$GENE_ID" awk -F'\t' '$1 == (ENVIRON["GENE_ID"] "")' "${E_DIR}/joined.txt" \
                | "$SORT_BIN" -t$'\t' -k2,2 -S100M) \
            "$genes_info_sorted" \
            | awk -F'\t' '{print $3"\t"$4}' \
            | "$SORT_BIN" -t$'\t' -g -k1,1 -S100M \
            | "$SORT_BIN" -t$'\t' -k2,2 -u -s -S100M \
            | "$SORT_BIN" -t$'\t' -g -k1,1 -S100M \
            > "${E_DIR}/result_${GENE_ID}_full.txt"
        head -n1000 "${E_DIR}/result_${GENE_ID}_full.txt" > "${E_DIR}/result_${GENE_ID}_top1000.txt"
    done 3< <(cut -f1 "$E_DIR/vectors.txt")

done
